########Data Load#############
#load package: foreign
#load data
data = read.dta("C://R/Replication/Data/kocc83_occ80.dta")
#dim(data) 306077     50

#get rid of data rows that have NA wage
data = data[!is.na(data$lnwage_impute1), ]
#282938     50

##############################
#Part 0: Data preparation
##############################

#get wages
lnwage = data$lnwage_impute1
n = length(lnwage)

#collect independent variables 
constant = rep(1, n, 1)
age = data$age
educ = data$gradeat
race = data$race
union = as.numeric(data$unionmme)
#turn union into a 0-1 dummy 
union[union == 2] = 0
#male = data$sex
#male[male == 2] = 0

#fill in dummy matrices (min & max age are 16 to 65, drop dummies that are all 0)
agebucket = ceiling(age/5)
agebucket = agebucket - min(agebucket) + 1
educbucket = educ + 1
age_dum = matrix(0, n, max(agebucket))
educ_dum = matrix(0, n, max(educbucket))
race_dum = matrix (0, n, max(race)) 

for (i in 1:n){
	age_dum[i, agebucket[i]] = 1
	educ_dum[i, educbucket[i]] = 1
	race_dum[i, race[i]] = 1
}

#put everything together (omit constant on X; add this later)
X = cbind(age_dum[,c(1:(ncol(age_dum) - 1))], educ_dum[, c(1:(ncol(educ_dum) - 1))], 
race_dum[,c(1:(ncol(race_dum) - 1))], union)
#dim(X) 282938     28

#compute averages by occupation 
avgY = aggregate(lnwage, list(occ = as.numeric(data$occ80)), mean)
avgX = aggregate(X, list(occ = as.numeric(data$occ80)), mean) #dim 488  32
numocc = as.numeric(data$occ80)
k = nrow(avgY)

#assign avg vectors 
Ya = rep(0, n)
Xa = matrix(0, n, ncol(avgX))
for (i in 1:k){
	#assign average
	index = which(numocc == avgY$occ[i])
	Ya[index] = avgY$x[i]
	for (j in 1:dim(avgX)[2]){
		Xa[index, j] = avgX[i, j]
	}
}
Xa = Xa[,c(2:ncol(Xa))]
#dim(Xa)   282938     28

##################################################
#Part 1: Variance decomposition (Replication)
##################################################

#compute differences
wagediff = data.matrix(lnwage - Ya)
Xdiff = data.matrix(X - Xa)

#run robust regression of wagediff on Xdiff to get beta estimates with white heteroskedastic errors
#R doesn't have a built in package like Stata for heteroskedastic robust std errors. 
#Since estimated coefficients are the same between heteroskedastic robust & homoskedastic OLS, 
#and std errors are not reported by the authors, I will use OLS to get coefficients.

result_OLS = lm(wagediff~Xdiff)
#############################################################################
#OLS Coefficients:

#(Intercept)	       Xdiff        Xdiff        Xdiff        Xdiff        Xdiff  
#  8.708e-16   -3.194e-01   -2.553e-01   -1.588e-01   -6.892e-02   -8.729e-03  
#      Xdiff        Xdiff        Xdiff        Xdiff        Xdiff        Xdiff  
#  1.101e-02    3.597e-02    5.692e-02    3.774e-02   -7.941e-01   -8.129e-01  
#      Xdiff        Xdiff        Xdiff        Xdiff        Xdiff        Xdiff  
# -8.125e-01   -7.826e-01   -7.675e-01   -7.452e-01   -7.352e-01   -7.387e-01  
#      Xdiff        Xdiff        Xdiff        Xdiff        Xdiff        Xdiff  
# -6.320e-01   -5.618e-01   -5.447e-01   -5.124e-01   -3.460e-01   -1.511e-01  
#      Xdiff        Xdiff        Xdiff        Xdiff   Xdiffunion  
# -4.644e-02    3.030e-02   -3.543e-02   -8.150e-03    2.370e-01  
#############################################################################
#############################################################################
#For comparison: outlier robust linear regression (RLM)
#result_rlm = rlm(wagediff~Xdiff)
#Converged in 4 iterations

#Coefficients:
# (Intercept)        Xdiff        Xdiff        Xdiff        Xdiff        Xdiff 
#-0.002025455 -0.319886396 -0.255114574 -0.159196504 -0.069502764 -0.011198150 
#       Xdiff        Xdiff        Xdiff        Xdiff        Xdiff        Xdiff 
# 0.009775201  0.034095579  0.054294142  0.035119862 -0.805806403 -0.818911164 
#       Xdiff        Xdiff        Xdiff        Xdiff        Xdiff        Xdiff 
#-0.815545382 -0.783669753 -0.770708884 -0.747986008 -0.736666756 -0.740794474 
#       Xdiff        Xdiff        Xdiff        Xdiff        Xdiff        Xdiff 
#-0.634340524 -0.564935566 -0.548416070 -0.514420758 -0.350602169 -0.152422893 
#       Xdiff        Xdiff        Xdiff        Xdiff   Xdiffunion 
#-0.051649122  0.026884519 -0.037283125 -0.017165033  0.238070996 

#############################################################################

#############################################################################
#mean fixed effects
beta = data.matrix(result_OLS$coeff)
alphas = Ya - cbind(constant, Xa)%*%beta
#############################################################################

#Get delta estimates
#compute prediction errors
esq = (lnwage - cbind(constant, X)%*%beta - alphas)^2
#compute average prediction errors by occupation  
esq_occ = aggregate(esq, list(occ = as.numeric(data$occ80)), mean)
#assign avg vectors  
esqa = rep(0, nrow(esq_occ))

for (i in 1:nrow(esq_occ)){
	#assign average
	index = which(numocc == esq_occ$occ[i])
	esqa[index] = esq_occ$V1[i]
}
#compute differences
esqdiff = esq - esqa
result2_OLS = lm(esqdiff~Xdiff)


#############################################################################
#OLS Coefficients:
#(Intercept)        Xdiff        Xdiff        Xdiff        Xdiff        Xdiff  
#  3.216e-17   -8.124e-02   -7.089e-02   -6.132e-02   -4.790e-02   -3.885e-02  
#      Xdiff        Xdiff        Xdiff        Xdiff        Xdiff        Xdiff  
# -3.045e-02   -3.067e-02   -2.088e-02   -1.480e-02   -6.344e-02   -7.780e-02  
#      Xdiff        Xdiff        Xdiff        Xdiff        Xdiff        Xdiff  
# -7.436e-02   -7.227e-02   -6.352e-02   -6.909e-02   -6.793e-02   -6.107e-02  
#      Xdiff        Xdiff        Xdiff        Xdiff        Xdiff        Xdiff  
# -5.995e-02   -4.533e-02   -5.289e-02   -5.953e-02   -1.198e-02    1.597e-04  
#      Xdiff        Xdiff        Xdiff        Xdiff   Xdiffunion  
#  8.032e-02   -1.000e-02   -8.959e-03    5.529e-03   -1.092e-02  
#############################################################################

#############################################################################
#variance fixed effects
psi = data.matrix(result2_OLS$coeff)
deltas = esqa -  cbind(constant, Xa)%*%psi
#############################################################################

#############################################################################
#Compute variance breakdown
mean_alpha = mean(na.omit(alphas))
#alpha_occ = na.omit(aggregate(alphas, list(occ = as.numeric(data$occ80)), mean))
#mean_alpha2 = colMeans(alpha_occ)[2]

mualphas = rep(mean_alpha, length(na.omit(alphas)))
between = sum((na.omit(alphas) - mualphas)^2)/length(na.omit(alphas))
within = sum(na.omit(deltas))/length(na.omit(deltas))
totalvar =  between + within 

#############################################################################


##################################################
#Part 2: ANOVA (Revision)
##################################################

#############################################################################

anova1 = aov(lnwage~numocc)
summary(anova1)
#                Df Sum Sq Mean Sq F value Pr(>F)    
#numocc           1   4662    4662   16210 <2e-16 ***
#Residuals   247409  71153       0                   

anova2 = aov(lnwage~numocc + X)
summary(anova2)
#                Df Sum Sq Mean Sq F value Pr(>F)    
#numocc           1   4662    4662   26434 <2e-16 ***
#X               31  27526     888    5035 <2e-16 ***
#Residuals   247378  43627       0  

anova3 = aov(lnwage~numocc + educ + age + race + union + male)
#                Df Sum Sq Mean Sq F value Pr(>F)    
#numocc           1   4662    4662   24393 <2e-16 ***
#educ             1   5609    5609   29347 <2e-16 ***
#age              1   7837    7837   41009 <2e-16 ***
#race             1    217     217    1134 <2e-16 ***
#union            1   3780    3780   19779 <2e-16 ***
#male             1   6428    6428   33635 <2e-16 ***
#Residuals   247404  47282       0   

##################################################
#Part 2: ANOVA (Revision)
##################################################
              
